#include <cuda_runtime.h>

#include <cstdlib>
#include <cstring>
#include <iostream>

using namespace std;

// Element-wise kernel: out[i] = matrixA[i]^2 * matrixB[i].
//
// Each thread handles exactly one element, at the flat 1D index
// blockIdx.x * blockDim.x + threadIdx.x. There is no bounds check, so all
// three buffers must hold at least gridDim.x * blockDim.x floats; launching
// more threads than elements reads and writes out of range.
//
// matrixA, matrixB : inputs, dereferenced in device code
// out              : output, one float written per launched thread
__global__ void test(float* matrixA, float* matrixB, float* out)
{
	const int idx = threadIdx.x + blockDim.x * blockIdx.x;

	// Read the A operand once and square it before scaling by B.
	const float a = matrixA[idx];
	const float aSquared = a * a;

	out[idx] = aSquared * matrixB[idx];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCudaStatus(cudaError_t status, const char* what)
{
	if (status == cudaSuccess)
	{
		return true;
	}

	std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
	return false;
}

// Host driver: fills two N-element float arrays with random() values, copies
// them to the device, runs the `test` kernel over them and copies the result
// back into h_out.
//
// Returns EXIT_SUCCESS when every allocation, copy and the kernel launch
// succeeded, EXIT_FAILURE otherwise (details are printed on stderr).
int main(int argc, char** argv)
{
	(void)argc;
	(void)argv;

	const int N = 1024 * 1024;
	const size_t bytes = sizeof(float) * N;

	// One thread per element. The kernel does not bounds-check its index, so
	// the launch must cover exactly N threads: N has to be a multiple of the
	// block size.
	const int threadsPerBlock = 256;
	if (N % threadsPerBlock != 0)
	{
		std::cerr << "N (" << N << ") must be a multiple of the block size ("
		          << threadsPerBlock << ")" << std::endl;
		return EXIT_FAILURE;
	}

	dim3 launchThreads(threadsPerBlock);
	dim3 launchBlocks(N / threadsPerBlock);

	float* h_matrixA = (float*)std::malloc(bytes);
	float* h_matrixB = (float*)std::malloc(bytes);
	float* h_out = (float*)std::malloc(bytes);

	float* d_matrixA = NULL;
	float* d_matrixB = NULL;
	float* d_out = NULL;

	// `ok` latches the first failure; every later step is skipped through
	// short-circuit evaluation, while the cleanup at the end always runs.
	bool ok = (h_matrixA != NULL) && (h_matrixB != NULL) && (h_out != NULL);
	if (!ok)
	{
		std::cerr << "host allocation of " << bytes << " bytes failed" << std::endl;
	}
	else
	{
		for (int i = 0; i < N; ++i)
		{
			h_matrixA[i] = static_cast<float>(random());
			h_matrixB[i] = static_cast<float>(random());
		}

		// Clear the whole result buffer once (size in bytes, not elements).
		std::memset(h_out, 0, bytes);
	}

	ok = ok && checkCudaStatus(cudaMalloc(&d_matrixA, bytes), "cudaMalloc(d_matrixA)");
	ok = ok && checkCudaStatus(cudaMalloc(&d_matrixB, bytes), "cudaMalloc(d_matrixB)");
	ok = ok && checkCudaStatus(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)");

	ok = ok && checkCudaStatus(
		cudaMemcpy(d_matrixA, h_matrixA, bytes, cudaMemcpyHostToDevice),
		"cudaMemcpy(d_matrixA <- h_matrixA)");
	ok = ok && checkCudaStatus(
		cudaMemcpy(d_matrixB, h_matrixB, bytes, cudaMemcpyHostToDevice),
		"cudaMemcpy(d_matrixB <- h_matrixB)");

	if (ok)
	{
		test<<<launchBlocks, launchThreads>>>(d_matrixA, d_matrixB, d_out);

		// A kernel launch returns nothing; configuration errors are only
		// visible through cudaGetLastError().
		ok = checkCudaStatus(cudaGetLastError(), "test kernel launch");
	}

	// This blocking copy also surfaces errors raised while the kernel ran.
	ok = ok && checkCudaStatus(
		cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost),
		"cudaMemcpy(h_out <- d_out)");

	// Debugging aid: uncomment to dump the results.
	// for (int i = 0; i < N; ++i)
	// {
	// 	cout << i << ": " << h_out[i] << endl;
	// }

	// Release the device buffers BEFORE resetting the device: the reset
	// destroys every allocation of this process, so freeing afterwards would
	// hand stale pointers to cudaFree. cudaFree(NULL) is a no-op, so this is
	// safe after a partial failure as well.
	ok = checkCudaStatus(cudaFree(d_matrixA), "cudaFree(d_matrixA)") && ok;
	ok = checkCudaStatus(cudaFree(d_matrixB), "cudaFree(d_matrixB)") && ok;
	ok = checkCudaStatus(cudaFree(d_out), "cudaFree(d_out)") && ok;
	ok = checkCudaStatus(cudaDeviceReset(), "cudaDeviceReset") && ok;

	std::free(h_matrixA);
	std::free(h_matrixB);
	std::free(h_out);

	return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
